In [1]:
from googleapiclient.discovery import build
from dateutil import parser
import pandas as pd
from IPython.display import JSON

# Data viz packages
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

# NLP
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
nltk.download('punkt')
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/luqiansong/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/luqiansong/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
In [ ]:
# load datasets
df = pd.read_csv('/Users/luqiansong/Desktop/22201381.csv')

Task 1: Data Understanding¶

In [3]:
#getting basic information about datasets 
df.info() 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8000 entries, 0 to 7999
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Unnamed: 0         8000 non-null   int64 
 1   category           7980 non-null   object
 2   headline           7987 non-null   object
 3   authors            6951 non-null   object
 4   link               7981 non-null   object
 5   short_description  7150 non-null   object
 6   date               7982 non-null   object
dtypes: int64(1), object(6)
memory usage: 437.6+ KB
In [4]:
# check the dataset for blank values
df.isnull().sum()
Out[4]:
Unnamed: 0              0
category               20
headline               13
authors              1049
link                   19
short_description     850
date                   18
dtype: int64
In [5]:
# found blank values exist in every columns
df.isnull().any()
Out[5]:
Unnamed: 0           False
category              True
headline              True
authors               True
link                  True
short_description     True
date                  True
dtype: bool
In [6]:
# Check data types to validate datatype
df.dtypes
Out[6]:
Unnamed: 0            int64
category             object
headline             object
authors              object
link                 object
short_description    object
date                 object
dtype: object
In [16]:
df.shape
Out[16]:
(8000, 7)
In [7]:
# check incorrect/imbalance data thourgh class count
category_counts = df['category'].value_counts()
category_counts.plot(kind='bar')
plt.title("Category Distribution")
plt.xlabel("Category")
plt.ylabel("Count")
plt.show()
In [2]:
import pandas as pd

df = pd.read_csv('/Users/luqiansong/Desktop/22201381.csv')

int_columns = ['category','headline', 'authors', 'short_description', 'date']
In [4]:
import os 
import numpy as np
import cv2
import random
import sklearn
import keras
import PIL
import matplotlib.pyplot as plt
import seaborn as sns
from keras import backend as K
from keras.layers import AveragePooling2D
from tensorflow.keras.optimizers import RMSprop
from keras.utils.np_utils import to_categorical
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, Conv2D, MaxPooling2D, Flatten
from keras.callbacks import ModelCheckpoint
from sklearn import metrics
from sklearn.metrics import confusion_matrix


import tensorflow as tf
2023-07-07 13:37:36.848599: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
In [ ]:
# analyse the lenght of the sentences in each category
In [82]:
# length: analyse relationship between short_description and category
df = pd.read_csv('/Users/luqiansong/Desktop/22201381.csv')
df['headline_length'] = df['headline'].str.len()

# Group the DataFrame by column 'category' and calculate descriptive statistics for headline length
grouped_lengths = df.groupby('category')['headline_length'].describe()

# Display the descriptive statistics for headline length in each category
display(grouped_lengths)
count mean std min 25% 50% 75% max
category
POLITICS 5973.0 63.721915 17.481447 11.0 53.0 65.0 75.0 189.0
SPORTS 1994.0 62.472919 16.363946 10.0 53.0 64.0 72.0 109.0
In [83]:
# length analyse relationship between headline and category
df['short_description_length'] = df['short_description'].str.len()

# Group the DataFrame by column 'category' and calculate descriptive statistics for headline length
grouped_lengths = df.groupby('category')['short_description_length'].describe()

# Display the descriptive statistics for headline length in each category
display(grouped_lengths)
count mean std min 25% 50% 75% max
category
POLITICS 5399.0 104.167438 76.482309 1.0 64.0 92.0 122.5 1136.0
SPORTS 1731.0 88.634315 61.772038 1.0 43.0 75.0 121.0 374.0
In [9]:
# check outlier according the text length 
import pandas as pd

df = pd.read_csv('/Users/luqiansong/Desktop/22201381.csv')
# Convert float values in 'text' column to strings
df['text_length'] = df['short_description'].astype(str)
# Calculate the length of each text sample
df['text_length'] = df['short_description'].apply(lambda x: len(str(x)))

# Calculate the mean and standard deviation of text lengths
mean_length = df['text_length'].mean()
std_length = df['text_length'].std()

# Define a threshold for outlier detection 
threshold = mean_length + 3 * std_length

# Identify outliers based on text length
outliers = df[df['text_length'] > threshold]
outliers
Out[9]:
Unnamed: 0 category headline authors link short_description date text_length
3 3 POLITICS An Open Letter to My Fellow Millennials on Hil... Nick Laure, ContributorAn advocate for logical... https://www.huffingtonpost.com/entry/an-open-l... I am not asking anyone to stop supporting Bern... 2016-03-20 318
59 59 POLITICS Can You Catch It From a Caliph? Ebola, ISIS, ... M. Gregg Bloche, M.D., J.D., ContributorAuthor... https://www.huffingtonpost.com/entry/can-you-c... Two potent forces power the Ebola and ISIS epi... 2014-10-02 318
72 72 POLITICS In Defense of Christians James Zogby, ContributorPresident, Arab Americ... https://www.huffingtonpost.com/entry/in-defens... We have every reason to be concerned with the ... 2014-09-06 345
238 238 POLITICS Democrats Should Take the Megalomaniac Seriously Dave R. Jacobson, ContributorDemocratic Strate... https://www.huffingtonpost.com/entry/democrats... "I'm a unifier," said Donald Trump, the odds-o... 2016-03-13 334
540 540 POLITICS Sunday Roundup Arianna Huffington, Contributor https://www.huffingtonpost.com/entry/sunday-ro... LONDON -- This week began with the continuing ... 2014-06-01 933
... ... ... ... ... ... ... ... ...
7685 7685 POLITICS Adam Smith vs. Ayn Rand David Morris, ContributorDirector, The Public ... https://www.huffingtonpost.com/entry/adam-smit... Even with Medicare and Medicaid, tens of milli... 2015-06-02 324
7777 7777 POLITICS Sunday Roundup Arianna Huffington, Contributor https://www.huffingtonpost.com/entry/sunday-ro... This week, the nation watched Kobe Bryant say ... 2016-04-17 1067
7820 7820 POLITICS Social Security Expansion Key to Averting Reti... Ben Veghte, ContributorVice President for Poli... https://www.huffingtonpost.com/entry/social-se... Workers today need to be saving much more for ... 2015-06-03 363
7840 7840 POLITICS Sunday Roundup Arianna Huffington, Contributor https://www.huffingtonpost.com/entry/sunday-ro... This week, the nation was once again shocked b... 2015-10-04 898
7897 7897 POLITICS The Three Stooges of the Grand Obstructionist ... Lance Simmens, ContributorAuthor, "Fracktured"... https://www.huffingtonpost.com/entry/the-three... Dick Cheney, Bill O'Reilly, and Rudy Giuliani ... 2015-02-22 329

86 rows × 8 columns

In [10]:
import matplotlib.pyplot as plt

# Plot histogram of text lengths
plt.hist(df['text_length'], bins=50)
plt.xlabel('Text Length')
plt.ylabel('Frequency')
plt.title('Distribution of Text Lengths')
plt.show()
In [11]:
# Plot boxplot of text lengths
plt.boxplot(df['text_length'])
plt.xlabel('Text Length')
plt.title('Boxplot of Text Lengths')
plt.show()
In [29]:
# check incorrect/imbalance data thourgh class distribution 
import matplotlib.pyplot as plt

# Calculate class distribution
class_distribution = df['category'].value_counts(normalize=True)

# Plot class distribution
plt.bar(class_distribution.index, class_distribution.values)
plt.xlabel('Class')
plt.ylabel('Frequency')
plt.title('Class Distribution')
plt.show()

# Calculate class counts
class_counts = df['category'].value_counts()
In [12]:
# Check Value Counts and Unique Values also check for logical inconsistencies
value_counts = df['category'].value_counts()
unique_values = df['category'].unique()
In [13]:
value_counts
Out[13]:
category
POLITICS    5983
SPORTS      1997
Name: count, dtype: int64

eed

In [14]:
unique_values
Out[14]:
array(['POLITICS', 'SPORTS', nan], dtype=object)
In [43]:
# I found unnamed column , removed it
df.drop(df.columns[df.columns.str.contains('unnamed',case = False)],axis = 1, inplace = True)
In [ ]:
# Analyse the infomation of short_description
In [30]:
df = pd.read_csv('/Users/luqiansong/Desktop/22201381.csv')
from wordcloud import WordCloud

# Convert float values in 'short_description' column to strings
df['short_description'] = df['short_description'].astype(str)

# Join the different processed descriptions together
long_string = ' '.join(list(df['short_description'].values))

# Create a WordCloud object
wordcloud = WordCloud(background_color='white', max_words=5000, contour_width=3, contour_color='steelblue')
wordcloud.generate(long_string)

# Display the word cloud
plt.figure(figsize=(10, 8))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()
In [38]:
df['desc_lemmatized'] = df['short_description'].str.replace('[^\w\s]','')
/var/folders/m_/m3lsq_594494n7k5zm6nmdtc0000gn/T/ipykernel_48720/2591372573.py:1: FutureWarning: The default value of regex will change from True to False in a future version.
  df['desc_lemmatized'] = df['short_description'].str.replace('[^\w\s]','')
In [39]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
new_words = {
    'cpos': -3.0,
    'cneg': 3.0,
}
analyser = SentimentIntensityAnalyzer()
analyser.lexicon.update(new_words)
scores=[]
for i in range(len(df['desc_lemmatized'])):
    
    score = analyser.polarity_scores(df['desc_lemmatized'][i])
    score=score['compound']
    scores.append(score)
sentiment=[]
for i in scores:
    if i>=0.75:
        sentiment.append('Overly Positive')
    elif (i>=0.05) and (i<0.75):
        sentiment.append('Positive')
    elif i<=(-0.75):
        sentiment.append('Overly Negative')
    elif (i<=-0.05) and (i>-0.75):
        sentiment.append('Negative')
    else:
        sentiment.append('Neutral')
df['sentiment']= pd.Series(np.array(sentiment))
In [40]:
df['score']= pd.Series(np.array(scores))
In [60]:
df.groupby(by="sentiment").mean()
/var/folders/m_/m3lsq_594494n7k5zm6nmdtc0000gn/T/ipykernel_48720/4284074795.py:1: FutureWarning:

The default value of numeric_only in DataFrameGroupBy.mean is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.

Out[60]:
Unnamed: 0 score
sentiment
Negative 4008.986454 -0.402634
Neutral 4003.686468 0.000086
Overly Negative 4112.705674 -0.833058
Overly Positive 3949.543974 0.829630
Positive 3977.015938 0.418560
In [62]:
df.head()
Out[62]:
Unnamed: 0 category headline authors link short_description date sentiment desc_lemmatized score
0 0 POLITICS Watch Highlights From The Last GOP Debate Befo... Amber Ferguson https://www.huffingtonpost.com/entry/new-hamps... Marco Rubio had a rough night. 2016-02-07 Neutral Marco Rubio had a rough night 0.0000
1 1 SPORTS Bob Costas And His Fedora Are 'Thursday Night ... Lee Moran https://www.huffingtonpost.com/entry/bob-costa... "Bob Costas' hat just got its own Martin Scors... 2016-12-23 Neutral Bob Costas hat just got its own Martin Scorses... 0.0000
2 2 POLITICS Hillary Clinton Met Privately With Elizabeth W... NaN https://www.huffingtonpost.com/entry/hillary-c... nan 2015-02-17 Neutral nan 0.0000
3 3 POLITICS An Open Letter to My Fellow Millennials on Hil... Nick Laure, ContributorAn advocate for logical... https://www.huffingtonpost.com/entry/an-open-l... I am not asking anyone to stop supporting Bern... 2016-03-20 Overly Positive I am not asking anyone to stop supporting Bern... 0.9136
4 4 POLITICS Key California Lawmaker Steps Down Amid Harass... Mollie Reilly https://www.huffingtonpost.com/entry/raul-boca... The state assemblyman announced Monday he'll r... 2017-11-21 Overly Negative The state assemblyman announced Monday hell re... -0.7906
In [73]:
import pandas as pd
import jinja2


plt.figure(figsize=(12,6))
sns.countplot(x='sentiment',data=df)
fig = go.Figure(go.Funnelarea(
    text =temp.sentiment,
    values = temp.desc_lemmatized,
    title = {"position": "top center", "text": "Funnel-Chart of Sentiment Distribution"}
    ))
fig.show()
In [89]:
from sklearn.feature_selection import mutual_info_classif

PCA to analyse other feature authors, data,their relationship wiht category¶

In [384]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder
In [401]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Load your DataFrame
frame = pd.read_csv("/Users/luqiansong/Desktop/22201381.csv")
In [414]:
# One-hot encode the "category" and "authors" columns
df_encoded = pd.get_dummies(frame, columns=["authors", "date","category"])

# Remove non-numeric columns (headline, link, short_description, and date)
non_numeric_columns = ["headline", "link", "short_description", "date"]
df_encoded = df_encoded.drop(columns=non_numeric_columns, errors='ignore')


scaler = StandardScaler()
data_scaled = scaler.fit_transform(df_encoded)

# Perform PCA
pca = PCA()
components = pca.fit_transform(data_scaled)
components
Out[414]:
array([[-8.69705664e-01,  6.26427880e-01,  7.22034372e-03, ...,
         3.58492191e-16,  1.88440731e-16, -1.14092553e-18],
       [ 1.95225394e+00,  5.29834071e-01,  9.34774259e-03, ...,
         1.41763971e-17, -6.35970648e-17, -3.07950819e-20],
       [-9.98525823e-01,  7.76308279e-01,  4.16426205e-03, ...,
         5.57767221e-18,  3.21935501e-17, -1.34339582e-20],
       ...,
       [ 3.32528900e+00, -7.16890127e-02,  5.15406559e-03, ...,
         1.28225445e-17,  2.94217825e-17, -2.80420349e-20],
       [-7.73387055e-01, -6.20599006e-01,  8.75271764e-03, ...,
        -1.32933787e-17,  3.74208307e-17,  9.37633396e-20],
       [-7.79325944e-01, -1.13166481e+00,  7.75873425e-03, ...,
         3.98750236e-18,  5.44806642e-18, -5.19434845e-21]])
In [430]:
from pandas import DataFrame as df
# Get the explained variance ratio to understand the contribution of each principal component
explained_variance = pca.explained_variance_ratio_

# Print the explained variance ratio for each principal component
print("Explained Variance Ratio for Principal Components:")
print(explained_variance)
Explained Variance Ratio for Principal Components:
[9.64377190e-04 6.13430832e-04 5.98622072e-04 ... 3.11243690e-36
 1.95600636e-36 7.42489189e-42]
In [432]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import mutual_info_classif
from sklearn.decomposition import PCA
In [427]:
from sklearn.feature_selection import mutual_info_classif
In [428]:
#Set our X and y
X = frame.loc[:,"authors":"date"]
y = frame.loc[:,"category"]
In [429]:
X
Out[429]:
authors link short_description date
0 Amber Ferguson https://www.huffingtonpost.com/entry/new-hamps... Marco Rubio had a rough night. 2016-02-07
1 Lee Moran https://www.huffingtonpost.com/entry/bob-costa... "Bob Costas' hat just got its own Martin Scors... 2016-12-23
2 NaN https://www.huffingtonpost.com/entry/hillary-c... NaN 2015-02-17
3 Nick Laure, ContributorAn advocate for logical... https://www.huffingtonpost.com/entry/an-open-l... I am not asking anyone to stop supporting Bern... 2016-03-20
4 Mollie Reilly https://www.huffingtonpost.com/entry/raul-boca... The state assemblyman announced Monday he'll r... 2017-11-21
... ... ... ... ...
7995 NaN https://www.huffingtonpost.comhttp://www.nytim... Hillary Clinton’s advisers and allies have beg... 2016-04-23
7996 Travis Waldron https://www.huffingtonpost.com/entry/raiders-n... Gambling advocates believe the NFL's embrace o... 2017-03-30
7997 Chris Greenberg https://www.huffingtonpost.com/entry/giants-wo... NaN 2014-10-30
7998 Chris D'Angelo https://www.huffingtonpost.com/entry/trump-lif... The decision was made public by none other tha... 2017-11-16
7999 The Trace, Editorial Partner https://www.huffingtonpost.com/entry/states-se... After talking to New Jersey’s compensation off... 2018-02-12

8000 rows × 4 columns

In [440]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.feature_selection import mutual_info_classif
from sklearn.decomposition import PCA

# Convert the target variable (y) to numeric format using label encoding
label_encoder = LabelEncoder()
y_numeric = label_encoder.fit_transform(y)

# Convert the feature columns (X) to numeric format using one-hot encoding

one_hot_encoder = OneHotEncoder()
X_numeric = one_hot_encoder.fit_transform(X)

# Calculate mutual information scores
i_scores = mutual_info_classif(X_numeric, y_numeric)
In [441]:
output = list(zip(frame.columns,i_scores))
i_frame = df(output,columns=["Feature","I-gain"])
i_frame
Out[441]:
Feature I-gain
0 Unnamed: 0 0.000036
1 category 0.000036
2 headline 0.000036
3 authors 0.000173
4 link 0.000036
5 short_description 0.000173
6 date 0.000036
In [442]:
i_frame = i_frame.sort_values(by=['I-gain'],ascending=False) 
i_frame = i_frame.reset_index() 
i_frame
Out[442]:
index Feature I-gain
0 3 authors 0.000173
1 5 short_description 0.000173
2 0 Unnamed: 0 0.000036
3 1 category 0.000036
4 2 headline 0.000036
5 4 link 0.000036
6 6 date 0.000036
In [443]:
#i_frame = i_frame.drop("index",axis=1)
i_frame = i_frame.set_index("Feature")
i_frame.plot.bar()
Out[443]:
<Axes: xlabel='Feature'>

find the most common term for each categoryies¶

In [84]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.metrics import confusion_matrix

complete_data = pd.read_csv('/Users/luqiansong/Desktop/22201381.csv')
data = complete_data[['short_description','category']]
In [85]:
data
Out[85]:
short_description category
0 Marco Rubio had a rough night. POLITICS
1 "Bob Costas' hat just got its own Martin Scors... SPORTS
2 NaN POLITICS
3 I am not asking anyone to stop supporting Bern... POLITICS
4 The state assemblyman announced Monday he'll r... POLITICS
... ... ...
7995 Hillary Clinton’s advisers and allies have beg... POLITICS
7996 Gambling advocates believe the NFL's embrace o... SPORTS
7997 NaN SPORTS
7998 The decision was made public by none other tha... POLITICS
7999 After talking to New Jersey’s compensation off... POLITICS

8000 rows × 2 columns

In [47]:
data['short_description'].isnull().sum()
Out[47]:
850
In [48]:
data['category'].value_counts()
Out[48]:
POLITICS    5983
SPORTS      1997
Name: category, dtype: int64
In [49]:
positive_class = data[data['category']=='SPORTS']['short_description']
negative_class = data[data['category']=='POLITICS']['short_description']
In [50]:
negative_class
Out[50]:
0                          Marco Rubio had a rough night.
2                                                     NaN
3       I am not asking anyone to stop supporting Bern...
4       The state assemblyman announced Monday he'll r...
6       Cross-posted with TomDispatch.com Since 9/11, ...
                              ...                        
7993    The state had been banned from hosting NCAA ga...
7994         Education policy is not immune to fake news.
7995    Hillary Clinton’s advisers and allies have beg...
7998    The decision was made public by none other tha...
7999    After talking to New Jersey’s compensation off...
Name: short_description, Length: 5983, dtype: object
In [63]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(stop_words='english')
positive_class = positive_class.fillna('')
words_in_positive_class = vectorizer.fit_transform(positive_class)
tokens_and_counts = zip(vectorizer.get_feature_names(), np.asarray(words_in_positive_class.sum(axis=0)).ravel())
/Users/luqiansong/opt/anaconda3/lib/python3.9/site-packages/sklearn/utils/deprecation.py:87: FutureWarning: Function get_feature_names is deprecated; get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead.
  warnings.warn(msg, category=FutureWarning)
In [64]:
df_tokens = pd.DataFrame(tokens_and_counts, columns=['Token', 'Count'])
df_tokens
Out[64]:
Token Count
0 000 6
1 002 1
2 029 1
3 04 1
4 09 1
... ... ...
5453 zealand 1
5454 zero 2
5455 zika 3
5456 zone 1
5457 zurich 1

5458 rows × 2 columns

In [65]:
df_tokens.sort_values("Count", ascending=False, inplace=True)
df_tokens.reset_index(inplace=True, drop=True)
df_tokens
Out[65]:
Token Count
0 game 96
1 just 80
2 team 73
3 like 69
4 nfl 65
... ... ...
5453 gym 1
5454 gymnasts 1
5455 gyms 1
5456 hacking 1
5457 zurich 1

5458 rows × 2 columns

In [66]:
most_popular_tokens = df_tokens.nlargest(columns="Count", n=15)
most_popular_tokens
Out[66]:
Token Count
0 game 96
1 just 80
2 team 73
3 like 69
4 nfl 65
5 football 63
6 said 62
7 sports 61
8 time 60
9 new 55
10 year 54
11 win 48
12 players 48
13 league 45
14 season 44
In [67]:
least_popular_tokens = df_tokens.nsmallest(columns="Count", n=15) 
least_popular_tokens
Out[67]:
Token Count
2154 spanning 1
2155 phalange 1
2156 pickup 1
2157 weapon 1
2158 pita 1
2159 spartans 1
2160 weaknesses 1
2161 pitched 1
2162 wbz 1
2163 philip 1
2164 southpaw 1
2165 wbc 1
2166 warrant 1
2167 phelps 1
2168 sweden 1
In [68]:
fig, axes = plt.subplots(2, 1, figsize=(20,8))
sns.barplot(ax=axes[0], data=least_popular_tokens, x="Token", y ="Count")
sns.barplot(ax=axes[1], data=most_popular_tokens, x="Token", y ="Count")
axes[0].set(ylabel='Counts', xlabel="Tokens", title="%d Least Frequent Tokens After Stop Word Removal" % 20 )
axes[1].set(ylabel='Counts', xlabel="Tokens", title="%d Most Frequent Tokens After Stop Word Removal" % 20 )
plt.tight_layout()
In [ ]:
# as shown above, the 20 most frequent words are:spanning, phalange, pickup, weapon, pita, spartans, weaknesses, pithced,wbs, philip, southpaw, wbc, warrant, phelps, sweden

TASK2 :DATA Preparation &Modelling¶

In [69]:
data
Out[69]:
short_description category
0 Marco Rubio had a rough night. POLITICS
1 "Bob Costas' hat just got its own Martin Scors... SPORTS
2 NaN POLITICS
3 I am not asking anyone to stop supporting Bern... POLITICS
4 The state assemblyman announced Monday he'll r... POLITICS
... ... ...
7995 Hillary Clinton’s advisers and allies have beg... POLITICS
7996 Gambling advocates believe the NFL's embrace o... SPORTS
7997 NaN SPORTS
7998 The decision was made public by none other tha... POLITICS
7999 After talking to New Jersey’s compensation off... POLITICS

8000 rows × 2 columns

In [86]:
import pandas as pd


data.dropna(inplace=True)


X = data['short_description']
y = data['category']# Split the dataset
X_train_plus_valid, X_test, y_train_plus_valid, y_test = train_test_split(X, y, random_state=0, test_size = 0.30, train_size = 0.7, stratify=y)
X_train, X_valid, y_train, y_valid = train_test_split(X_train_plus_valid, y_train_plus_valid, random_state=0, test_size = 0.199/0.7, train_size = 0.5/0.7, stratify=y_train_plus_valid)
/var/folders/m_/m3lsq_594494n7k5zm6nmdtc0000gn/T/ipykernel_34631/2359622052.py:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.dropna(inplace=True)
In [87]:
type(X_valid)
Out[87]:
pandas.core.series.Series
In [88]:
print(X_train.shape)
print(X_valid.shape)
print(X_test.shape)
(3565,)
(1419,)
(2139,)
In [ ]:
# Generating a bag-of-words model
In [79]:
vectorizer = CountVectorizer(stop_words='english')
vectorizer.fit(X_train)
Out[79]:
CountVectorizer(stop_words='english')
In [89]:
X_train_matrix = vectorizer.transform(X_train)
X_valid_matrix = vectorizer.transform(X_valid)
X_test_matrix = vectorizer.transform(X_test)
In [90]:
print(X_train_matrix.shape)
print(X_valid_matrix.shape)
print(X_test_matrix.shape)
(3565, 8958)
(1419, 8958)
(2139, 8958)
In [91]:
neigh = KNeighborsClassifier(n_neighbors=1)
neigh.fit(X_train_matrix, y_train)
Out[91]:
KNeighborsClassifier(n_neighbors=1)
In [92]:
# # 
X_cm = X_train_matrix
y_true_labels = y_train
model = neigh
# Apply trained model to new dataset
y_pred = model.predict(X_cm)
print(metrics.classification_report(y_true_labels, y_pred))
cm=confusion_matrix(y_true_labels, y_pred)
ax= plt.subplot()
sns.heatmap(cm, annot=True, fmt='g', ax=ax); 
ax.set_xlabel('Predicted labels');ax.set_ylabel('True labels'); 
ax.set_title('Confusion Matrix Training Set'); 
/Users/luqiansong/opt/anaconda3/lib/python3.9/site-packages/sklearn/neighbors/_classification.py:228: FutureWarning: Unlike other reduction functions (e.g. `skew`, `kurtosis`), the default behavior of `mode` typically preserves the axis it acts along. In SciPy 1.11.0, this behavior will change: the default value of `keepdims` will become False, the `axis` over which the statistic is taken will be eliminated, and the value None will no longer be accepted. Set `keepdims` to True or False to avoid this warning.
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
              precision    recall  f1-score   support

    POLITICS       1.00      1.00      1.00      2699
      SPORTS       0.99      1.00      0.99       866

    accuracy                           1.00      3565
   macro avg       1.00      1.00      1.00      3565
weighted avg       1.00      1.00      1.00      3565

In [93]:
X_cm = X_valid_matrix
y_true_labels = y_valid
model = neigh
# Apply trained model to new dataset
y_pred = model.predict(X_cm)
print(metrics.classification_report(y_true_labels, y_pred))
cm=confusion_matrix(y_true_labels, y_pred)
ax= plt.subplot()
sns.heatmap(cm, annot=True, fmt='g', ax=ax); 
ax.set_xlabel('Predicted labels');ax.set_ylabel('True labels'); 
ax.set_title('Confusion Matrix Validation Set'); 
              precision    recall  f1-score   support

    POLITICS       0.77      0.89      0.82      1075
      SPORTS       0.32      0.17      0.22       344

    accuracy                           0.71      1419
   macro avg       0.55      0.53      0.52      1419
weighted avg       0.66      0.71      0.68      1419

/Users/luqiansong/opt/anaconda3/lib/python3.9/site-packages/sklearn/neighbors/_classification.py:228: FutureWarning: Unlike other reduction functions (e.g. `skew`, `kurtosis`), the default behavior of `mode` typically preserves the axis it acts along. In SciPy 1.11.0, this behavior will change: the default value of `keepdims` will become False, the `axis` over which the statistic is taken will be eliminated, and the value None will no longer be accepted. Set `keepdims` to True or False to avoid this warning.
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
In [94]:
# y_pred is a numpy array. It needs to be converted.
y_pred.shape
Out[94]:
(1419,)
In [95]:
print(len(X_valid))
print(len(y_valid))
print(len(y_pred))
1419
1419
1419
In [96]:
y_pred = pd.DataFrame(y_pred, index=X_valid.index, columns=['y_pred'])
In [97]:
valid_X_y = pd.concat([X_valid,y_valid,y_pred], axis=1)
In [98]:
valid_X_y
Out[98]:
short_description category y_pred
871 R.I.P. Omer Asik. SPORTS POLITICS
7471 The 2016 election was a stunning blow to the m... POLITICS POLITICS
1400 "They’re gonna start some s**t." POLITICS POLITICS
7893 A day after a gunman opened fire on a practice... POLITICS POLITICS
2911 So far, the ads look nothing like NASCAR. SPORTS POLITICS
... ... ... ...
136 To say he doesn't really care. SPORTS POLITICS
65 A Quinnipiac poll found that 59 percent of wom... POLITICS POLITICS
6094 Holly Rowe of ESPN showed exactly what society... SPORTS POLITICS
2874 American Lindsey Vonn missed the podium with a... SPORTS POLITICS
180 Ben Carson, the retired neurosurgeon who brief... POLITICS POLITICS

1419 rows × 3 columns

I set Training set: 70% of the data,Validation set: 15% of the data, Test set: 15% of the data¶

In [64]:
# split the dataset with a 70-15-15 ratio for the training, validation, and test sets, respectively
from sklearn.model_selection import train_test_split

# Split the dataset into training, validation, and test sets
train_df, temp_df = train_test_split(df, test_size=0.3, random_state=42)
valid_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# Save the datasets as separate CSV files
train_df.to_csv('train.csv', index=False)
valid_df.to_csv('valid.csv', index=False)
test_df.to_csv('test.csv', index=False)
In [4]:
import pandas as pd
# Load train.csv
train_data = pd.read_csv('/Users/luqiansong/Desktop/train.csv')

# Load valid.csv
valid_data = pd.read_csv('/Users/luqiansong/Desktop/valid.csv')
In [20]:
# check train_data info
train_data.head()
Out[20]:
category headline authors link short_description date headline_length short_description_length
0 SPORTS Jack Sock Retires From U.S. Open Match After S... NaN https://www.huffingtonpost.com/entry/jack-sock... American Jack Sock was overcome by the heat an... 2015-09-03 70 119.0
1 POLITICS Former Mexican President Vicente Fox Issues St... Lee Moran https://www.huffingtonpost.com/entry/vicente-f... "You better speak up, because this guy is taki... 2018-04-06 85 65.0
2 POLITICS Kurds and US vs ISIS Ryan Campbell, ContributorEditor at DRM Capito... https://www.huffingtonpost.com/entry/kurds-and... Allying with Kurdish forces means we can bette... 2014-11-19 20 206.0
3 POLITICS Anthony Scaramucci, We Hardly Knew Ye Marina Fang https://www.huffingtonpost.com/entry/anthony-s... 10 highlights from the Mooch's 10 days as Whit... 2017-07-31 37 78.0
4 SPORTS Super Bowl Commercials 2014: Watch All Ads Air... Chris Greenberg https://www.huffingtonpost.com/entry/super-bow... CLICK HERE to watch the 50 Greatest Super Bowl... 2014-02-02 90 127.0
In [21]:
#All LowerCase for train_data
import sys,csv,re
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

def converter(x):
    try:
        return ' '.join([x.lower() for x in str(x).split() if x not in stop_words])
    except AttributeError:
        return None  
train_data['short_description'] = train_data['short_description'].apply(converter)
In [22]:
# check train_data lowercase situation
train_data.head()
Out[22]:
category headline authors link short_description date headline_length short_description_length
0 SPORTS Jack Sock Retires From U.S. Open Match After S... NaN https://www.huffingtonpost.com/entry/jack-sock... american jack sock overcome heat retire fourth... 2015-09-03 70 119.0
1 POLITICS Former Mexican President Vicente Fox Issues St... Lee Moran https://www.huffingtonpost.com/entry/vicente-f... "you better speak up, guy taking nowhere." 2018-04-06 85 65.0
2 POLITICS Kurds and US vs ISIS Ryan Campbell, ContributorEditor at DRM Capito... https://www.huffingtonpost.com/entry/kurds-and... allying kurdish forces means better fight isis... 2014-11-19 20 206.0
3 POLITICS Anthony Scaramucci, We Hardly Knew Ye Marina Fang https://www.huffingtonpost.com/entry/anthony-s... 10 highlights mooch's 10 days white house comm... 2017-07-31 37 78.0
4 SPORTS Super Bowl Commercials 2014: Watch All Ads Air... Chris Greenberg https://www.huffingtonpost.com/entry/super-bow... click here watch 50 greatest super bowl commer... 2014-02-02 90 127.0
In [23]:
#Removing Punctuation in train_data
train_data['description_punc'] =train_data['short_description'].str.replace('[^\w\s]','')
In [24]:
#Removal of stop words
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stop = stopwords.words('english')
train_data['description_stop']  = train_data['description_punc'] .apply(lambda x: " ".join(x for x in x.split() if x not in stop))
train_data.head()
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/luqiansong/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Out[24]:
category headline authors link short_description date headline_length short_description_length description_punc description_stop
0 SPORTS Jack Sock Retires From U.S. Open Match After S... NaN https://www.huffingtonpost.com/entry/jack-sock... american jack sock overcome heat retire fourth... 2015-09-03 70 119.0 american jack sock overcome heat retire fourth... american jack sock overcome heat retire fourth...
1 POLITICS Former Mexican President Vicente Fox Issues St... Lee Moran https://www.huffingtonpost.com/entry/vicente-f... "you better speak up, guy taking nowhere." 2018-04-06 85 65.0 "you better speak up, guy taking nowhere." "you better speak up, guy taking nowhere."
2 POLITICS Kurds and US vs ISIS Ryan Campbell, ContributorEditor at DRM Capito... https://www.huffingtonpost.com/entry/kurds-and... allying kurdish forces means better fight isis... 2014-11-19 20 206.0 allying kurdish forces means better fight isis... allying kurdish forces means better fight isis...
3 POLITICS Anthony Scaramucci, We Hardly Knew Ye Marina Fang https://www.huffingtonpost.com/entry/anthony-s... 10 highlights mooch's 10 days white house comm... 2017-07-31 37 78.0 10 highlights mooch's 10 days white house comm... 10 highlights mooch's 10 days white house comm...
4 SPORTS Super Bowl Commercials 2014: Watch All Ads Air... Chris Greenberg https://www.huffingtonpost.com/entry/super-bow... click here watch 50 greatest super bowl commer... 2014-02-02 90 127.0 click here watch 50 greatest super bowl commer... click watch 50 greatest super bowl commercials...
In [25]:
#Tokenization 
import textblob           
from textblob import TextBlob
def tokenization(short_description):
    short_description = re.split('\W+', short_description)
    return short_description
train_data['description_tokenized'] = train_data['description_stop'].apply(lambda x: tokenization(x.lower()))
In [26]:
#Lemmatization is a more effective option than stemming because it converts the word into its root word, 
#rather than just stripping the suffices.
#nltk.download('wordnet')
wn = nltk.WordNetLemmatizer()
def lemmatizer(short_description):
    short_description = [wn.lemmatize(word) for word in short_description]
    return short_description
train_data['description_lemmatized'] = train_data['description_tokenized'].apply(lambda x: lemmatizer(x))
In [23]:
# check all the transformation to the train data 
train_data[['short_description', 'description_punc', 'description_tokenized','description_stop','description_lemmatized']][0:9]
Out[23]:
short_description description_punc description_tokenized description_stop description_lemmatized
0 american jack sock overcome heat retire fourth... american jack sock overcome heat retire fourth... [american, jack, sock, overcome, heat, retire,... american jack sock overcome heat retire fourth... [american, jack, sock, overcome, heat, retire,...
1 "you better speak up, guy taking nowhere." "you better speak up, guy taking nowhere." [, you, better, speak, up, guy, taking, nowher... "you better speak up, guy taking nowhere." [, you, better, speak, up, guy, taking, nowher...
2 allying kurdish forces means better fight isis... allying kurdish forces means better fight isis... [allying, kurdish, forces, means, better, figh... allying kurdish forces means better fight isis... [allying, kurdish, force, mean, better, fight,...
3 10 highlights mooch's 10 days white house comm... 10 highlights mooch's 10 days white house comm... [10, highlights, mooch, s, 10, days, white, ho... 10 highlights mooch's 10 days white house comm... [10, highlight, mooch, s, 10, day, white, hous...
4 click here watch 50 greatest super bowl commer... click here watch 50 greatest super bowl commer... [click, watch, 50, greatest, super, bowl, comm... click watch 50 greatest super bowl commercials... [click, watch, 50, greatest, super, bowl, comm...
5 here's hoping dirt winter. here's hoping dirt winter. [here, s, hoping, dirt, winter, ] here's hoping dirt winter. [here, s, hoping, dirt, winter, ]
6 a visit trump tower reveals donald trump famil... a visit trump tower reveals donald trump famil... [visit, trump, tower, reveals, donald, trump, ... visit trump tower reveals donald trump family ... [visit, trump, tower, reveals, donald, trump, ...
7 so might time rethink administration's refugee... so might time rethink administration's refugee... [might, time, rethink, administration, s, refu... might time rethink administration's refugee po... [might, time, rethink, administration, s, refu...
8 hillary clinton's testimony mostly confirmed p... hillary clinton's testimony mostly confirmed p... [hillary, clinton, s, testimony, mostly, confi... hillary clinton's testimony mostly confirmed p... [hillary, clinton, s, testimony, mostly, confi...
In [32]:
train_data.drop(columns=['description_punc', 'description_tokenized', 'description_stop','headline_length','short_description_length'])
Out[32]:
category headline authors link short_description date description_lemmatized
0 SPORTS Jack Sock Retires From U.S. Open Match After S... NaN https://www.huffingtonpost.com/entry/jack-sock... american jack sock overcome heat retire fourth... 2015-09-03 [american, jack, sock, overcome, heat, retire,...
1 POLITICS Former Mexican President Vicente Fox Issues St... Lee Moran https://www.huffingtonpost.com/entry/vicente-f... "you better speak up, guy taking nowhere." 2018-04-06 [, you, better, speak, up, guy, taking, nowher...
2 POLITICS Kurds and US vs ISIS Ryan Campbell, ContributorEditor at DRM Capito... https://www.huffingtonpost.com/entry/kurds-and... allying kurdish forces means better fight isis... 2014-11-19 [allying, kurdish, force, mean, better, fight,...
3 POLITICS Anthony Scaramucci, We Hardly Knew Ye Marina Fang https://www.huffingtonpost.com/entry/anthony-s... 10 highlights mooch's 10 days white house comm... 2017-07-31 [10, highlight, mooch, s, 10, day, white, hous...
4 SPORTS Super Bowl Commercials 2014: Watch All Ads Air... Chris Greenberg https://www.huffingtonpost.com/entry/super-bow... click here watch 50 greatest super bowl commer... 2014-02-02 [click, watch, 50, greatest, super, bowl, comm...
... ... ... ... ... ... ... ...
5595 POLITICS U.S. Military Cancels Hearing For 9/11 Suspects NaN https://www.huffingtonpost.com/entry/military-... another setback government efforts try five me... 2015-08-16 [another, setback, government, effort, try, fi...
5596 POLITICS Repealing Obamacare Is A Trap For The GOP, Chu... Michael McAuliff https://www.huffingtonpost.com/entry/chuck-sch... "they regret day it.” 2016-11-22 [, they, regret, day, it, ]
5597 SPORTS Top-Tier Gymnast Maggie Nichols Says Larry Nas... Alanna Vagianos https://www.huffingtonpost.com/entry/gymnast-m... nichols wrote statement first alert usa gymnas... 2018-01-09 [nichols, wrote, statement, first, alert, usa,...
5598 POLITICS Why Scott Walker Will Never Be President NaN https://www.huffingtonpost.com/entry/scott-wal... nan 2014-06-21 [nan]
5599 POLITICS Paul Ryan Just Got The Sweetest Deal In Congress Zach Carter https://www.huffingtonpost.com/entry/paul-ryan... and democrats thrilled. 2015-10-23 [democrat, thrilled, ]

5600 rows × 7 columns

In [6]:
valid_data = pd.read_csv('/Users/luqiansong/Desktop/valid.csv')
# check text_data info
valid_data.head()
Out[6]:
category headline authors link short_description date headline_length short_description_length
0 POLITICS HuffPost Rise: What You Need To Know On April 21 NaN https://www.huffingtonpost.com/entry/huffpost-... Welcome to the HuffPost Rise Morning Newsbrief... 2016-04-21 48 103.0
1 POLITICS How The Thomas Fire Could Affect An Already St... Antonia Blumberg https://www.huffingtonpost.com/entry/southern-... "I don’t think Ventura County is well-position... 2017-12-12 73 83.0
2 SPORTS Michael Phelps To U.S. Olympic Committee: Do S... Jenna Amatulli https://www.huffingtonpost.com/entry/michael-p... Most athletes experience post-Olympics depress... 2018-04-02 83 84.0
3 POLITICS HUFFPOST HILL - Better Call Lanny Eliot Nelson https://www.huffingtonpost.com/entry/huffpost-... NaN 2014-09-17 33 NaN
4 POLITICS Trump Booed At Davos For Criticizing 'Fake' Media Marina Fang https://www.huffingtonpost.com/entry/trump-dav... The president dismissed reports that he ordere... 2018-01-26 49 108.0
In [33]:
train_data.to_csv('train_Cleaned.csv')
In [7]:
# doing transformation on valid data

# Load valid.csv
valid_data = pd.read_csv('/Users/luqiansong/Desktop/valid.csv')

#All LowerCase for train_data
import sys,csv,re
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

def converter(x):
    try:
        return ' '.join([x.lower() for x in str(x).split() if x not in stop_words])
    except AttributeError:
        return None  
valid_data['short_description'] = valid_data['short_description'].apply(converter)
In [8]:
#Removing Punctuation in train_data
valid_data['description_punc'] =valid_data['short_description'].str.replace('[^\w\s]','')
In [9]:
#Removal of stop words
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stop = stopwords.words('english')
valid_data['description_stop']  = valid_data['description_punc'] .apply(lambda x: " ".join(x for x in x.split() if x not in stop))
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/luqiansong/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
In [10]:
#Tokenization 
import textblob           
from textblob import TextBlob
def tokenization(short_description):
    short_description = re.split('\W+', short_description)
    return short_description
valid_data['description_tokenized'] = valid_data['description_stop'].apply(lambda x: tokenization(x.lower()))
In [11]:
#Lemmatization 
#nltk.download('wordnet')
wn = nltk.WordNetLemmatizer()
def lemmatizer(short_description):
    short_description = [wn.lemmatize(word) for word in short_description]
    return short_description
valid_data['description_lemmatized'] = valid_data['description_tokenized'].apply(lambda x: lemmatizer(x))
In [12]:
# check all the transformation to the valid data 
valid_data[['short_description', 'description_punc', 'description_tokenized','description_stop','description_lemmatized']][0:9]
Out[12]:
short_description description_punc description_tokenized description_stop description_lemmatized
0 welcome huffpost rise morning newsbrief, short... welcome huffpost rise morning newsbrief, short... [welcome, huffpost, rise, morning, newsbrief, ... welcome huffpost rise morning newsbrief, short... [welcome, huffpost, rise, morning, newsbrief, ...
1 "i don’t think ventura county well-positioned ... "i don’t think ventura county well-positioned ... [, i, don, t, think, ventura, county, well, po... "i don’t think ventura county well-positioned ... [, i, don, t, think, ventura, county, well, po...
2 most athletes experience post-olympics depress... most athletes experience post-olympics depress... [athletes, experience, post, olympics, depress... athletes experience post-olympics depression, ... [athlete, experience, post, olympics, depressi...
3 nan nan [nan] nan [nan]
4 the president dismissed reports ordered firing... the president dismissed reports ordered firing... [president, dismissed, reports, ordered, firin... president dismissed reports ordered firing spe... [president, dismissed, report, ordered, firing...
5 a lower court found ban disproportionately aff... a lower court found ban disproportionately aff... [lower, court, found, ban, disproportionately,... lower court found ban disproportionately affec... [lower, court, found, ban, disproportionately,...
6 the ruling paves way states legalize sports be... the ruling paves way states legalize sports be... [ruling, paves, way, states, legalize, sports,... ruling paves way states legalize sports bettin... [ruling, pave, way, state, legalize, sport, be...
7 hillary clinton recently dismissed idea gettin... hillary clinton recently dismissed idea gettin... [hillary, clinton, recently, dismissed, idea, ... hillary clinton recently dismissed idea gettin... [hillary, clinton, recently, dismissed, idea, ...
8 like read below? sign huffpost hill get cheeky... like read below? sign huffpost hill get cheeky... [like, read, below, sign, huffpost, hill, get,... like read below? sign huffpost hill get cheeky... [like, read, below, sign, huffpost, hill, get,...
In [17]:
valid_data.drop(columns=['headline_length', 'short_description_length', 'description_stop', 'description_tokenized','description_punc'])
Out[17]:
category headline authors link short_description date description_lemmatized
0 POLITICS HuffPost Rise: What You Need To Know On April 21 NaN https://www.huffingtonpost.com/entry/huffpost-... welcome huffpost rise morning newsbrief, short... 2016-04-21 [welcome, huffpost, rise, morning, newsbrief, ...
1 POLITICS How The Thomas Fire Could Affect An Already St... Antonia Blumberg https://www.huffingtonpost.com/entry/southern-... "i don’t think ventura county well-positioned ... 2017-12-12 [, i, don, t, think, ventura, county, well, po...
2 SPORTS Michael Phelps To U.S. Olympic Committee: Do S... Jenna Amatulli https://www.huffingtonpost.com/entry/michael-p... most athletes experience post-olympics depress... 2018-04-02 [athlete, experience, post, olympics, depressi...
3 POLITICS HUFFPOST HILL - Better Call Lanny Eliot Nelson https://www.huffingtonpost.com/entry/huffpost-... nan 2014-09-17 [nan]
4 POLITICS Trump Booed At Davos For Criticizing 'Fake' Media Marina Fang https://www.huffingtonpost.com/entry/trump-dav... the president dismissed reports ordered firing... 2018-01-26 [president, dismissed, report, ordered, firing...
... ... ... ... ... ... ... ...
1195 SPORTS NCAA Tournament Teams, Seeds 2014: March Madne... NaN https://www.huffingtonpost.com/entry/ncaa-tour... shortly conference tournaments wrapped selecti... 2014-03-16 [shortly, conference, tournament, wrapped, sel...
1196 POLITICS Obama Condemns 'Cynical' GOP Race Baiting In V... Mollie Reilly https://www.huffingtonpost.com/entry/obama-ral... "i don’t think anybody really thinks somebody ... 2017-10-20 [, i, don, t, think, anybody, really, think, s...
1197 SPORTS Happy Kid Dancing To 'Happy' At A Basketball G... Lucy McCalmont https://www.huffingtonpost.com/entry/kid-danci... kid dances basketball game, amazing life. 2015-03-27 [kid, dance, basketball, game, amazing, life, ]
1198 POLITICS Donald Trump's Labor Pick Would Be Expected To... Dave Jamieson https://www.huffingtonpost.com/entry/trumps-la... hardee's agreed pay workers nearly $60,000 run... 2016-12-09 [hardee, s, agreed, pay, worker, nearly, 60, 0...
1199 POLITICS HUFFPOLLSTER: Young Americans Heavily Favor Tr... Natalie Jackson, Ariel Edwards-Levy, and Janie... https://www.huffingtonpost.com/entry/transgend... the public whole split, 18-29-year-olds much p... 2016-04-21 [public, whole, split, 18, 29, year, old, much...

1200 rows × 7 columns

In [19]:
valid_data.to_csv('valid_Cleaned.csv')
In [ ]:
# I stored train_cleaned and valid_cleaned file after transforamtion only keep description_lemmatized
In [120]:
# Load train.csv
train_cleandata = pd.read_csv('/Users/luqiansong/Desktop/train_Cleaned.csv')

# Load valid.csv
valid_cleandata = pd.read_csv('/Users/luqiansong/Desktop/valid_Cleaned.csv')

Build Binary classification models, using logistic regression and random forest these two classifiers between two categories, # the parameters are description-lemmatized¶

classifiers 1: logistic Regression,reason:it performs well when the realtionship between the features and the artget variabl :target vaiable can take two outcomes,shos the accuracy is 0.87¶

In [124]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# classifiers 1: logistic Regression:
# Select the 'short_description' columns as text features
text_features = ['description_lemmatized']
text_data = train_cleandata[text_features]

# Combine the text data into a single column
text_data['combined_text'] = text_data[text_features].apply(lambda x: ' '.join(x), axis=1)

# Create an instance of the TF-IDF vectorizer
vectorizer = TfidfVectorizer()

# Fit the vectorizer to the combined text data
X_train_text = vectorizer.fit_transform(text_data['combined_text'])

# Convert the TF-IDF matrix to an array
X_train_text = X_train_text.toarray()

# Select the numerical features from the original dataset
numeric_features = [ 'short_description_length']
X_train_numeric = train_cleandata[numeric_features].values

# Concatenate the text and numerical features
X_train = np.concatenate((X_train_text, X_train_numeric), axis=1)

# Select the 'category' column as the target variable
y_train = train_cleandata['category']
/var/folders/m_/m3lsq_594494n7k5zm6nmdtc0000gn/T/ipykernel_69502/1467203526.py:11: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  text_data['combined_text'] = text_data[text_features].apply(lambda x: ' '.join(x), axis=1)
In [127]:
test=pd.read_csv('/Users/luqiansong/Desktop/test.csv')
In [130]:
y_test=test['category']
In [137]:
#  Import necessary libraries
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

#  Create and train the logistic regression model
logistic_regression_model = LogisticRegression()
logistic_regression_model.fit(X_train, y_train)

#  Predict the target variable on the training data
y_train_pred = logistic_regression_model.predict(X_train)

#  Calculate the training accuracy
training_accuracy = accuracy_score(y_train, y_train_pred)

#  Display the training accuracy
print("Training Accuracy:", training_accuracy)
Training Accuracy: 0.8750749550269838
/Users/luqiansong/opt/anaconda3/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
In [ ]:
#  Import the necessary library
from sklearn.metrics import confusion_matrix

#  Predict the target variable on the training data
y_train_pred = logistic_regression_model.predict(X_train)

 # Create the confusion matrix
conf_matrix = confusion_matrix(y_train, y_train_pred)

 # Display the confusion matrix
print("Confusion Matrix:")
print(conf_matrix)
Confusion Matrix:
[[3812    6]
 [ 619  566]]
In [144]:
y_train_pred = logistic_regression_model.predict(X_train)

#  Create the confusion matrix
conf_matrix = confusion_matrix(y_train, y_train_pred)

#  Display the confusion matrix with a heatmap
print("Confusion Matrix:")
print(conf_matrix)

# Display the heatmap
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="YlGnBu")
plt.xlabel("Predicted Labels")
plt.ylabel("True Labels")
plt.title("Confusion Matrix Heatmap")
plt.show()
Confusion Matrix:
[[3812    6]
 [ 619  566]]

classifer 2 :Random Forest and evaluation with accuracy 0.96 , reason:it can handle missing value and high accuracy, t¶

In [44]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

# Handling missing values by replacing them with zeros
train_cleandata.fillna(0, inplace=True)

# Select the 'description_lemmatized' columns as text features
text_features = ['description_lemmatized']
text_data = train_cleandata[text_features]

# Combine the text data into a single column
text_data['combined_text'] = text_data[text_features].apply(lambda x: ' '.join(x), axis=1)

# Create an instance of the TF-IDF vectorizer
vectorizer = TfidfVectorizer()

# Fit the vectorizer to the combined text data
X_train_text = vectorizer.fit_transform(text_data['combined_text'])

# Convert the TF-IDF matrix to an array
X_train_text = X_train_text.toarray()

# Select the numerical features from the original dataset
numeric_features = ['short_description_length']
X_train_numeric = train_cleandata[numeric_features].values

# Concatenate the text and numerical features
X_train = np.concatenate((X_train_text, X_train_numeric), axis=1)

# Convert the target variable to categorical
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(train_data['category'])

# Initialize and train the Random Forest classifier
rf_classifier = RandomForestClassifier()
rf_classifier.fit(X_train, y_train)

#  Predict on the training data
y_train_pred = rf_classifier.predict(X_train)

# Calculate accuracy on the training data
train_accuracy = accuracy_score(y_train, y_train_pred)

# Display the training accuracy
train_accuracy
/var/folders/m_/m3lsq_594494n7k5zm6nmdtc0000gn/T/ipykernel_10082/2101469619.py:15: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  text_data['combined_text'] = text_data[text_features].apply(lambda x: ' '.join(x), axis=1)
Out[44]:
0.9651785714285714

Apply an end-to end sequential model,¶

In [93]:
import os 
import os 
import numpy as np
import cv2
import random
import sklearn
import keras
import PIL
import matplotlib.pyplot as plt
import seaborn as sns
from keras import backend as K
from keras.layers import AveragePooling2D
from tensorflow.keras.optimizers import RMSprop
from keras.utils.np_utils import to_categorical
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, Conv2D, MaxPooling2D, Flatten
from keras.callbacks import ModelCheckpoint
from sklearn import metrics
from sklearn.metrics import confusion_matrix

import tensorflow as tf
In [148]:
from keras.layers import Embedding
embedding_dim = 50
vocab_size=12482
max_length=150

model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length))
model.add(Flatten())
model.add(Dense(10, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.summary()
Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 embedding_3 (Embedding)     (None, 150, 50)           624100    
                                                                 
 flatten_3 (Flatten)         (None, 7500)              0         
                                                                 
 dense_10 (Dense)            (None, 10)                75010     
                                                                 
 dense_11 (Dense)            (None, 1)                 11        
                                                                 
=================================================================
Total params: 699,121
Trainable params: 699,121
Non-trainable params: 0
_________________________________________________________________
In [151]:
model.compile(optimizer=RMSprop(learning_rate=0.0001, decay=1e-6),
              loss='binary_crossentropy',
              metrics=['accuracy'])
In [152]:
import pandas as pd
# Load train.csv
train_data = pd.read_csv('/Users/luqiansong/Desktop/train_Cleaned.csv')

# Load valid.csv
valid_data = pd.read_csv('/Users/luqiansong/Desktop/valid_Cleaned.csv')
In [153]:
X_train=train_data['description_lemmatized']
y_train_wide=train_data['category']

X_valid=valid_data['description_lemmatized']
y_valid_wide=valid_data['category']

from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from tensorflow.keras.preprocessing.sequence import pad_sequences


# Tokenize the text data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

# Convert text to sequences
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_valid_seq = tokenizer.texts_to_sequences(X_valid)

# Pad sequences to a fixed length
max_length = 150  
X_train_pad = pad_sequences(X_train_seq, maxlen=max_length)
X_valid_pad = pad_sequences(X_valid_seq, maxlen=max_length)

# Rest 
history = model.fit(X_train_pad, y_train, \
          epochs=20, batch_size=32, verbose=1, \
          validation_data=(X_valid_pad, y_valid))
Epoch 1/20
175/175 [==============================] - 1s 3ms/step - loss: 0.5791 - accuracy: 0.7487 - val_loss: 0.5874 - val_accuracy: 0.7333
Epoch 2/20
175/175 [==============================] - 0s 3ms/step - loss: 0.5602 - accuracy: 0.7523 - val_loss: 0.5813 - val_accuracy: 0.7333
Epoch 3/20
175/175 [==============================] - 0s 3ms/step - loss: 0.5547 - accuracy: 0.7523 - val_loss: 0.5749 - val_accuracy: 0.7333
Epoch 4/20
175/175 [==============================] - 0s 3ms/step - loss: 0.5495 - accuracy: 0.7523 - val_loss: 0.5713 - val_accuracy: 0.7333
Epoch 5/20
175/175 [==============================] - 0s 3ms/step - loss: 0.5427 - accuracy: 0.7523 - val_loss: 0.5650 - val_accuracy: 0.7333
Epoch 6/20
175/175 [==============================] - 0s 3ms/step - loss: 0.5338 - accuracy: 0.7523 - val_loss: 0.5586 - val_accuracy: 0.7333
Epoch 7/20
175/175 [==============================] - 0s 3ms/step - loss: 0.5235 - accuracy: 0.7523 - val_loss: 0.5520 - val_accuracy: 0.7333
Epoch 8/20
175/175 [==============================] - 1s 3ms/step - loss: 0.5114 - accuracy: 0.7523 - val_loss: 0.5419 - val_accuracy: 0.7333
Epoch 9/20
175/175 [==============================] - 1s 3ms/step - loss: 0.4963 - accuracy: 0.7523 - val_loss: 0.5274 - val_accuracy: 0.7333
Epoch 10/20
175/175 [==============================] - 1s 3ms/step - loss: 0.4786 - accuracy: 0.7523 - val_loss: 0.5119 - val_accuracy: 0.7333
Epoch 11/20
175/175 [==============================] - 0s 2ms/step - loss: 0.4578 - accuracy: 0.7529 - val_loss: 0.4958 - val_accuracy: 0.7333
Epoch 12/20
175/175 [==============================] - 0s 3ms/step - loss: 0.4344 - accuracy: 0.7561 - val_loss: 0.4816 - val_accuracy: 0.7358
Epoch 13/20
175/175 [==============================] - 1s 3ms/step - loss: 0.4106 - accuracy: 0.7754 - val_loss: 0.4581 - val_accuracy: 0.7550
Epoch 14/20
175/175 [==============================] - 1s 3ms/step - loss: 0.3866 - accuracy: 0.8007 - val_loss: 0.4468 - val_accuracy: 0.7567
Epoch 15/20
175/175 [==============================] - 1s 3ms/step - loss: 0.3654 - accuracy: 0.8229 - val_loss: 0.4305 - val_accuracy: 0.7700
Epoch 16/20
175/175 [==============================] - 1s 3ms/step - loss: 0.3451 - accuracy: 0.8473 - val_loss: 0.4143 - val_accuracy: 0.8000
Epoch 17/20
175/175 [==============================] - 0s 3ms/step - loss: 0.3261 - accuracy: 0.8652 - val_loss: 0.4055 - val_accuracy: 0.8008
Epoch 18/20
175/175 [==============================] - 0s 3ms/step - loss: 0.3089 - accuracy: 0.8784 - val_loss: 0.3950 - val_accuracy: 0.8075
Epoch 19/20
175/175 [==============================] - 0s 3ms/step - loss: 0.2943 - accuracy: 0.8873 - val_loss: 0.3864 - val_accuracy: 0.8167
Epoch 20/20
175/175 [==============================] - 0s 3ms/step - loss: 0.2811 - accuracy: 0.8929 - val_loss: 0.3799 - val_accuracy: 0.8242
In [139]:
# History saves the training into a dictionary structure with the keys below
history.history.keys()
Out[139]:
dict_keys(['loss', 'accuracy', 'val_loss', 'val_accuracy'])

Task 3 Evaluation¶

Primay metrics is accuracy because it provide a simple adn intuitive measure of how well a model is performing¶

In [154]:
# summarize history for accuracy
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('Accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'validation'], loc='lower right')
plt.show()
In [155]:
# summarize history for loss
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'validation'], loc='upper right')
# plt.ylim((-0.1, 1.1))
plt.show()
In [ ]:
 
In [66]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
In [68]:
vectorizer = CountVectorizer(stop_words='english')
vectorizer.fit(X_train)
Out[68]:
CountVectorizer(stop_words='english')
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
CountVectorizer(stop_words='english')
In [70]:
X_train_matrix = vectorizer.transform(X_train)
X_valid_matrix = vectorizer.transform(X_valid)
In [83]:
# Handling missing values by replacing them with zeros
train_cleandata.fillna(0, inplace=True)

# Select the 'description_lemmatized' columns as text features
text_features = ['description_lemmatized']
text_data = train_cleandata[text_features]

# Combine the text data into a single column
text_data['combined_text'] = text_data[text_features].apply(lambda x: ' '.join(x), axis=1)

# Create an instance of the TF-IDF vectorizer
vectorizer = TfidfVectorizer()

# Fit the vectorizer to the combined text data
X_train_text = vectorizer.fit_transform(text_data['combined_text'])

# Convert the TF-IDF matrix to an array
X_train_text = X_train_text.toarray()

# Select the numerical features from the original dataset
numeric_features = ['short_description_length']
X_train_numeric = train_cleandata[numeric_features].values

# Concatenate the text and numerical features
X_train = np.concatenate((X_train_text, X_train_numeric), axis=1)

# Convert the target variable to categorical
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(train_data['category'])
/var/folders/m_/m3lsq_594494n7k5zm6nmdtc0000gn/T/ipykernel_69502/3337987493.py:9: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  text_data['combined_text'] = text_data[text_features].apply(lambda x: ' '.join(x), axis=1)
Out[83]:
KNeighborsClassifier(n_neighbors=1)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
KNeighborsClassifier(n_neighbors=1)
In [85]:
sequential = KNeighborsClassifier(n_neighbors=1)
sequential.fit(X_train_matrix, y_train)
Out[85]:
KNeighborsClassifier(n_neighbors=1)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
KNeighborsClassifier(n_neighbors=1)
In [88]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Change X and y to the dataset 
X_cm = X_train_matrix
y_true_labels = y_train

# Convert the target variable to categorical
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(train_data['category'])

# Create and fit the KNeighborsClassifier
knn_classifier = KNeighborsClassifier(n_neighbors=1)
knn_classifier.fit(X_train, y_train)

# Apply trained model to new dataset
y_pred = knn_classifier.predict(X_train)

# Print classification report
print(classification_report(y_train, y_pred))

# Create and plot the confusion matrix
cm = confusion_matrix(y_train, y_pred)
ax = plt.subplot()
sns.heatmap(cm, annot=True, fmt='g', ax=ax)
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix Training Set')
plt.show()
              precision    recall  f1-score   support

           0       0.96      1.00      0.98      4213
           1       1.00      0.86      0.92      1374
           2       1.00      1.00      1.00        13

    accuracy                           0.97      5600
   macro avg       0.98      0.95      0.97      5600
weighted avg       0.97      0.97      0.96      5600

In [92]:
# y_pred is a numpy array. It needs to be converted.
y_pred.shape
Out[92]:
(5600,)
In [94]:
y_valid=valid_data['category']
In [100]:
print(len(X_valid))
print(len(y_valid))
print(len(y_pred))
1200
1200
5600
In [107]:
y_pred_valid = y_pred[:len(X_valid)]
In [109]:
y_pred_df = pd.DataFrame(y_pred_valid, index=X_valid.index, columns=['y_pred'])
In [111]:
valid_X_y = pd.concat([X_valid,y_valid,y_pred_df], axis=1)
In [112]:
valid_X_y
Out[112]:
description_lemmatized category y_pred
0 ['welcome', 'huffpost', 'rise', 'morning', 'ne... POLITICS 1
1 ['', 'i', 'don', 't', 'think', 'ventura', 'cou... POLITICS 0
2 ['athlete', 'experience', 'post', 'olympics', ... SPORTS 0
3 ['nan'] POLITICS 0
4 ['president', 'dismissed', 'report', 'ordered'... POLITICS 1
... ... ... ...
1195 ['shortly', 'conference', 'tournament', 'wrapp... SPORTS 1
1196 ['', 'i', 'don', 't', 'think', 'anybody', 'rea... POLITICS 0
1197 ['kid', 'dance', 'basketball', 'game', 'amazin... SPORTS 0
1198 ['hardee', 's', 'agreed', 'pay', 'worker', 'ne... POLITICS 0
1199 ['public', 'whole', 'split', '18', '29', 'year... POLITICS 0

1200 rows × 3 columns

In [115]:
# mistakes_valid for model = Sequential()
193+2
Out[115]:
195
In [ ]:
# mistakes_valid for logistic_regression_model
In [148]:
619+6
Out[148]:
625

Got different error analysis for different model,found that the senqutial model is high accuracy

Change parameter from short_description to headline and redo evaluation¶

In [201]:
complete_data = pd.read_csv('/Users/luqiansong/Desktop/22201381.csv')
data = complete_data[['headline','category']]
In [202]:
data
Out[202]:
headline category
0 Watch Highlights From The Last GOP Debate Befo... POLITICS
1 Bob Costas And His Fedora Are 'Thursday Night ... SPORTS
2 Hillary Clinton Met Privately With Elizabeth W... POLITICS
3 An Open Letter to My Fellow Millennials on Hil... POLITICS
4 Key California Lawmaker Steps Down Amid Harass... POLITICS
... ... ...
7995 Hillary Clinton’s Campaign, Cautious But Confi... POLITICS
7996 What The Raiders’ Move To Vegas Means For The ... SPORTS
7997 Giants Are World Series Champions! SPORTS
7998 Trump To Lift Ban On Import Of Elephant Trophi... POLITICS
7999 States Set Aside Millions Of Dollars For Crime... POLITICS

8000 rows × 2 columns

In [164]:
 
In [203]:
import pandas as pd


data.dropna(inplace=True)


X = data['headline']
y = data['category']
X_train_plus_valid, X_test, y_train_plus_valid, y_test = train_test_split(X, y, random_state=0, test_size = 0.30, train_size = 0.7, stratify=y)
X_train, X_valid, y_train, y_valid = train_test_split(X_train_plus_valid, y_train_plus_valid, random_state=0, test_size = 0.199/0.7, train_size = 0.5/0.7, stratify=y_train_plus_valid)
/Users/luqiansong/opt/anaconda3/lib/python3.9/site-packages/pandas/util/_decorators.py:311: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)
In [204]:
type(X_valid)
Out[204]:
pandas.core.series.Series
In [205]:
print(X_train.shape)
print(X_valid.shape)
print(X_test.shape)
(3982,)
(1586,)
(2391,)
In [206]:
vectorizer = CountVectorizer(stop_words='english')
vectorizer.fit(X_train)
Out[206]:
CountVectorizer(stop_words='english')
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
CountVectorizer(stop_words='english')
In [207]:
X_train_matrix = vectorizer.transform(X_train)
X_valid_matrix = vectorizer.transform(X_valid)
X_test_matrix = vectorizer.transform(X_test)
In [208]:
print(X_train_matrix.shape)
print(X_valid_matrix.shape)
print(X_test_matrix.shape)
(3982, 7596)
(1586, 7596)
(2391, 7596)
In [209]:
df=data
In [210]:
neigh = KNeighborsClassifier(n_neighbors=1)
neigh.fit(X_train_matrix, y_train)
Out[210]:
KNeighborsClassifier(n_neighbors=1)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
KNeighborsClassifier(n_neighbors=1)
In [211]:
# # Change X and y to the dataset 
X_cm = X_train_matrix
y_true_labels = y_train
model = neigh
# Apply trained model to new dataset
y_pred = model.predict(X_cm)
print(metrics.classification_report(y_true_labels, y_pred))
cm=confusion_matrix(y_true_labels, y_pred)
ax= plt.subplot()
sns.heatmap(cm, annot=True, fmt='g', ax=ax); 
ax.set_xlabel('Predicted labels');ax.set_ylabel('True labels'); 
ax.set_title('Confusion Matrix Training Set'); 
              precision    recall  f1-score   support

    POLITICS       1.00      1.00      1.00      2985
      SPORTS       1.00      1.00      1.00       997

    accuracy                           1.00      3982
   macro avg       1.00      1.00      1.00      3982
weighted avg       1.00      1.00      1.00      3982

In [251]:
# Extract features (X) and labels (y)
X = data['headline']
y = data['category']
In [252]:
X
Out[252]:
0       Watch Highlights From The Last GOP Debate Befo...
1       Bob Costas And His Fedora Are 'Thursday Night ...
2       Hillary Clinton Met Privately With Elizabeth W...
3       An Open Letter to My Fellow Millennials on Hil...
4       Key California Lawmaker Steps Down Amid Harass...
                              ...                        
7995    Hillary Clinton’s Campaign, Cautious But Confi...
7996    What The Raiders’ Move To Vegas Means For The ...
7997                   Giants Are World Series Champions!
7998    Trump To Lift Ban On Import Of Elephant Trophi...
7999    States Set Aside Millions Of Dollars For Crime...
Name: headline, Length: 7967, dtype: object
In [215]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Create a TF-IDF vectorizer
vectorizer = TfidfVectorizer(max_features=1000)  

# Fit and transform the text data
X_vectorized = vectorizer.fit_transform(X)
In [216]:
from sklearn.preprocessing import LabelEncoder

# Create a label encoder
label_encoder = LabelEncoder()

# Fit and transform the labels
y_encoded = label_encoder.fit_transform(y)

LogisticRegression for replationship between headline and category , adn evaluation¶

In [217]:
from sklearn.linear_model import LogisticRegression


X_train, X_test, y_train, y_test = train_test_split(X_vectorized, y_encoded, test_size=0.2, random_state=42)

# Create and train the Logistic Regression model
logreg_model = LogisticRegression()
logreg_model.fit(X_train, y_train)
Out[217]:
LogisticRegression()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LogisticRegression()
In [218]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Make predictions on the test set
y_pred = logreg_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Print the classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Print the confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)
Accuracy: 0.9109159347553325
Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.99      0.94      1223
           1       0.94      0.66      0.78       371

    accuracy                           0.91      1594
   macro avg       0.92      0.82      0.86      1594
weighted avg       0.91      0.91      0.91      1594

Confusion Matrix:
[[1206   17]
 [ 125  246]]
In [219]:
# Create the confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)

# Create a DataFrame for the confusion matrix
conf_matrix_df = pd.DataFrame(conf_matrix, index=label_encoder.classes_, columns=label_encoder.classes_)

# Create the heatmap
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix_df, annot=True, cmap='Blues', fmt='d', cbar=False)
plt.xlabel('Predicted Category')
plt.ylabel('True Category')
plt.title('Confusion Matrix')
plt.show()

randomforestclassifie to changed parameter and evaluation¶

In [220]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Create a TF-IDF vectorizer
vectorizer = TfidfVectorizer(max_features=1000)  

# Fit and transform the text data
X_vectorized = vectorizer.fit_transform(X)
In [221]:
from sklearn.preprocessing import LabelEncoder

# Create a label encoder
label_encoder = LabelEncoder()

# Fit and transform the labels
y_encoded = label_encoder.fit_transform(y)
In [222]:
from sklearn.ensemble import RandomForestClassifie

 
X_train, X_test, y_train, y_test = train_test_split(X_vectorized, y_encoded, test_size=0.2, random_state=42)

# Create and train the Random Forest classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42) 
rf_model.fit(X_train, y_train)
Out[222]:
RandomForestClassifier(random_state=42)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomForestClassifier(random_state=42)
In [223]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Make predictions on the test set
y_pred = rf_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Print the classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Print the confusion matrix using heatmap 
import seaborn as sns
import matplotlib.pyplot as plt

conf_matrix = confusion_matrix(y_test, y_pred)
conf_matrix_df = pd.DataFrame(conf_matrix, index=label_encoder.classes_, columns=label_encoder.classes_)

plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix_df, annot=True, cmap='Blues', fmt='d', cbar=False)
plt.xlabel('Predicted Category')
plt.ylabel('True Category')
plt.title('Confusion Matrix')
plt.show()
Accuracy: 0.9146800501882058
Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.97      0.95      1223
           1       0.88      0.73      0.80       371

    accuracy                           0.91      1594
   macro avg       0.90      0.85      0.87      1594
weighted avg       0.91      0.91      0.91      1594

best model sequential model for changed parameter,and evaluation¶

In [226]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Create a TF-IDF vectorizer
vectorizer = TfidfVectorizer(max_features=1000)  

# Fit and transform the text data
X_vectorized = vectorizer.fit_transform(X)
In [227]:
from sklearn.preprocessing import LabelEncoder

# Create a label encoder
label_encoder = LabelEncoder()

# Fit and transform the labels
y_encoded = label_encoder.fit_transform(y)
In [228]:
from tensorflow import keras
from tensorflow.keras import layers


X_train, X_test, y_train, y_test = train_test_split(X_vectorized, y_encoded, test_size=0.2, random_state=42)

# Create the sequential model
model = keras.Sequential([
    layers.Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    layers.Dropout(0.5),
    layers.Dense(32, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(1, activation='sigmoid')  
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(X_train.toarray(), y_train, epochs=10, batch_size=32, validation_split=0.2)
Epoch 1/10
160/160 [==============================] - 1s 2ms/step - loss: 0.5322 - accuracy: 0.7428 - val_loss: 0.4181 - val_accuracy: 0.7482
Epoch 2/10
160/160 [==============================] - 0s 2ms/step - loss: 0.3301 - accuracy: 0.8474 - val_loss: 0.2549 - val_accuracy: 0.8941
Epoch 3/10
160/160 [==============================] - 0s 2ms/step - loss: 0.2083 - accuracy: 0.9176 - val_loss: 0.2194 - val_accuracy: 0.9075
Epoch 4/10
160/160 [==============================] - 0s 2ms/step - loss: 0.1653 - accuracy: 0.9425 - val_loss: 0.2262 - val_accuracy: 0.9075
Epoch 5/10
160/160 [==============================] - 0s 1ms/step - loss: 0.1382 - accuracy: 0.9492 - val_loss: 0.2254 - val_accuracy: 0.9051
Epoch 6/10
160/160 [==============================] - 0s 1ms/step - loss: 0.1188 - accuracy: 0.9541 - val_loss: 0.2323 - val_accuracy: 0.9051
Epoch 7/10
160/160 [==============================] - 0s 1ms/step - loss: 0.1141 - accuracy: 0.9594 - val_loss: 0.2433 - val_accuracy: 0.9051
Epoch 8/10
160/160 [==============================] - 0s 1ms/step - loss: 0.0939 - accuracy: 0.9651 - val_loss: 0.2614 - val_accuracy: 0.9106
Epoch 9/10
160/160 [==============================] - 0s 2ms/step - loss: 0.0805 - accuracy: 0.9714 - val_loss: 0.2704 - val_accuracy: 0.9106
Epoch 10/10
160/160 [==============================] - 0s 2ms/step - loss: 0.0707 - accuracy: 0.9753 - val_loss: 0.2902 - val_accuracy: 0.9098
In [317]:
model.save("lateruse.h5")
In [229]:
# Evaluate the model
loss, accuracy = model.evaluate(X_test.toarray(), y_test)
print("Test Accuracy:", accuracy)
50/50 [==============================] - 0s 706us/step - loss: 0.2913 - accuracy: 0.9134
Test Accuracy: 0.9134253263473511
In [232]:
# History saves the training into a dictionary structure with the keys below
history.history.keys()
Out[232]:
dict_keys(['loss', 'accuracy', 'val_loss', 'val_accuracy'])
In [233]:
# summarize history for accuracy
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('Accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'validation'], loc='lower right')
plt.show()
In [234]:
# summarize history for loss
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'validation'], loc='upper right')
# plt.ylim((-0.1, 1.1))
plt.show()
In [230]:
import numpy as np

# Make predictions on the test set
y_pred_prob = model.predict(X_test.toarray())
y_pred = np.round(y_pred_prob).flatten().astype(int)

# Convert predictions back to categorical labels
y_pred_labels = label_encoder.inverse_transform(y_pred)

# Convert true labels back to categorical labels
y_true_labels = label_encoder.inverse_transform(y_test)

# Create the confusion matrix
conf_matrix = confusion_matrix(y_true_labels, y_pred_labels)

# Create a DataFrame for the confusion matrix
conf_matrix_df = pd.DataFrame(conf_matrix, index=label_encoder.classes_, columns=label_encoder.classes_)
50/50 [==============================] - 0s 658us/step
In [231]:
import seaborn as sns
import matplotlib.pyplot as plt

plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix_df, annot=True, cmap='Blues', fmt='d', cbar=False)
plt.xlabel('Predicted Category')
plt.ylabel('True Category')
plt.title('Confusion Matrix')
plt.show()
In [ ]:
# the result is similar to the one i obtained for the validation set, 
# compared to the short_description,the model of "headline",the accuray , TN, TP (true posivite )are higher.

Error analysis for threee model on changed parameter¶

In [ ]:
# sequential model mistake valid
In [238]:
80+58
Out[238]:
138
In [ ]:
# logistic regression mistake valid
In [239]:
125+17
Out[239]:
142
In [ ]:
# randomforest mistake valid
In [240]:
99+37
Out[240]:
136

Merge train and validaton set¶

In [253]:
import numpy as np

# Concatenate X_train and X_valid
X_train_valid = np.concatenate((X_train, X_valid), axis=0)

# Concatenate y_train and y_valid
y_train_valid = np.concatenate((y_train, y_valid), axis=0)

perform cross validation for logisticregression¶

In [256]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Create a TF-IDF vectorizer
vectorizer = TfidfVectorizer(max_features=1000)  

# Fit and transform the text data
X_train_valid_vectorized = vectorizer.fit_transform(X_train_valid)
In [257]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

# Create and initialize the Logistic Regression model
logreg_model = LogisticRegression()

# Perform cross-validation and get the accuracy scores

cv_scores = cross_val_score(logreg_model, X_train_valid_vectorized, y_train_valid, cv=5)

# Print the cross-validation results
print("Cross-Validation Scores:", cv_scores)
print("Mean CV Accuracy:", np.mean(cv_scores))
print("Standard Deviation of CV Accuracy:", np.std(cv_scores))
Cross-Validation Scores: [0.89856373 0.89048474 0.89048474 0.88230009 0.89038634]
Mean CV Accuracy: 0.8904439293416632
Standard Deviation of CV Accuracy: 0.005143150809612093
In [ ]:
# acrossing cross validation, the model have achieved a reasonbale consistent accuracy
# reason:the mean cv accuracy indicated the overall performance is 0.89
# the small standard deviation suggests that model is relatively stable

Apply best model senqutial model to test set¶

In [306]:
import pandas as pd


data.dropna(inplace=True)


X = data['headline']
y = data['category']
X_train_plus_valid, X_test, y_train_plus_valid, y_test = train_test_split(X, y, random_state=0, test_size = 0.30, train_size = 0.7, stratify=y)
X_train, X_valid, y_train, y_valid = train_test_split(X_train_plus_valid, y_train_plus_valid, random_state=0, test_size = 0.199/0.7, train_size = 0.5/0.7, stratify=y_train_plus_valid)
/Users/luqiansong/opt/anaconda3/lib/python3.9/site-packages/pandas/util/_decorators.py:311: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)
In [308]:
# View the first few rows of the test set data (X_test)
print("Test set data (X_test):")
print(X_test.head())
Test set data (X_test):
1403    Can You Spot Which Part Of This Tweet Is Of Qu...
1104             Confessions Of A Tonya Harding Apologist
7078    Donald Trump's Supreme Court Pick Came Of Age ...
7899                       Week 16 Fantasy Football Focus
4175    Tom Brady Asks Why His Friendship With Donald ...
Name: headline, dtype: object
In [309]:
# View the first few rows of the test set labels (y_test)
print("Test set labels (y_test):")
print(y_test.head())
Test set labels (y_test):
1403      SPORTS
1104      SPORTS
7078    POLITICS
7899      SPORTS
4175      SPORTS
Name: category, dtype: object
In [310]:
# View the shape of the test set data and labels
print("Test set data shape:", X_test.shape)
print("Test set labels shape:", y_test.shape)
Test set data shape: (2391,)
Test set labels shape: (2391,)
In [311]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Create a TF-IDF vectorizer
vectorizer = TfidfVectorizer(max_features=1000)  

# Fit and transform the text data
X_vectorized = vectorizer.fit_transform(X)
In [312]:
from sklearn.preprocessing import LabelEncoder

# Create a label encoder
label_encoder = LabelEncoder()

# Fit and transform the labels
y_encoded = label_encoder.fit_transform(y)
In [314]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
In [318]:
from tensorflow.keras.models import load_model

# Load the model from the saved file
model = load_model("lateruse.h5")
In [319]:
model.load_weights("lateruse.h5")
In [321]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Create a tokenizer and fit it on the headlines in X_test
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_test)

# Convert text to sequences of integers
X_test_sequences = tokenizer.texts_to_sequences(X_test)

# Pad sequences to ensure they all have the same length
max_sequence_length = 1000  
X_test_padded = pad_sequences(X_test_sequences, maxlen=max_sequence_length)
In [323]:
# Load the model
model = keras.models.load_model('lateruse.h5')

# Make predictions on the test data
predictions = model.predict(X_test_padded)

# Convert probabilities to binary classes for binary classification
predicted_classes = (predictions > 0.5).astype('int32')
75/75 [==============================] - 0s 575us/step
In [337]:
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns


# Convert y_test to binary classes for binary classification
y_test_binary = (y_test == 'SPORTS').astype('int32')

# Generate the confusion matrix
conf_matrix = confusion_matrix(y_test_binary, predicted_classes)

# Visualize the confusion matrix using Seaborn
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
            xticklabels=['POLOTICS', 'SPORTS'],
            yticklabels=['POLOTICS', 'SPORTS'])
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('Confusion Matrix')
plt.show()

Retrain the best model using the train and validation dataset¶

In [332]:
import numpy as np

# Concatenate X_train and X_valid
X_train_valid = np.concatenate((X_train, X_valid), axis=0)

# Concatenate y_train and y_valid
y_train_valid = np.concatenate((y_train, y_valid), axis=0)
In [340]:
print("Shape of X_train_valid:", X_train_valid.shape)
print("Shape of y_train_valid:", y_train_valid.shape)
Shape of X_train_valid: (5568,)
Shape of y_train_valid: (5568,)
In [341]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Create a TF-IDF vectorizer
vectorizer = TfidfVectorizer(max_features=1000)  

# Fit and transform the text data
X_vectorized = vectorizer.fit_transform(X)
In [342]:
from sklearn.preprocessing import LabelEncoder

# Create a label encoder
label_encoder = LabelEncoder()

# Fit and transform the labels
y_encoded = label_encoder.fit_transform(y)
In [346]:
from tensorflow import keras
from tensorflow.keras import layers

X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Create the sequential model
model = keras.Sequential([
    layers.Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    layers.Dropout(0.5),
    layers.Dense(32, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(1, activation='sigmoid')  
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(X_train.toarray(), y_train, epochs=10, batch_size=32, validation_split=0.2)
Epoch 1/10
128/128 [==============================] - 1s 2ms/step - loss: 0.5863 - accuracy: 0.7310 - val_loss: 0.5056 - val_accuracy: 0.7333
Epoch 2/10
128/128 [==============================] - 0s 1ms/step - loss: 0.4158 - accuracy: 0.7744 - val_loss: 0.3294 - val_accuracy: 0.8676
Epoch 3/10
128/128 [==============================] - 0s 2ms/step - loss: 0.2478 - accuracy: 0.9041 - val_loss: 0.2431 - val_accuracy: 0.8990
Epoch 4/10
128/128 [==============================] - 0s 2ms/step - loss: 0.1827 - accuracy: 0.9328 - val_loss: 0.2380 - val_accuracy: 0.9088
Epoch 5/10
128/128 [==============================] - 0s 1ms/step - loss: 0.1464 - accuracy: 0.9431 - val_loss: 0.2506 - val_accuracy: 0.9098
Epoch 6/10
128/128 [==============================] - 0s 2ms/step - loss: 0.1326 - accuracy: 0.9527 - val_loss: 0.2622 - val_accuracy: 0.9069
Epoch 7/10
128/128 [==============================] - 0s 2ms/step - loss: 0.1133 - accuracy: 0.9590 - val_loss: 0.2884 - val_accuracy: 0.9029
Epoch 8/10
128/128 [==============================] - 0s 2ms/step - loss: 0.0941 - accuracy: 0.9667 - val_loss: 0.3088 - val_accuracy: 0.9020
Epoch 9/10
128/128 [==============================] - 0s 1ms/step - loss: 0.0813 - accuracy: 0.9696 - val_loss: 0.3250 - val_accuracy: 0.9029
Epoch 10/10
128/128 [==============================] - 0s 2ms/step - loss: 0.0745 - accuracy: 0.9735 - val_loss: 0.3421 - val_accuracy: 0.9059
In [347]:
# Evaluate the model
loss, accuracy = model.evaluate(X_valid.toarray(), y_valid)
print("Test Accuracy:", accuracy)
40/40 [==============================] - 0s 723us/step - loss: 0.2468 - accuracy: 0.9169
Test Accuracy: 0.9168627262115479
In [348]:
# summarize history for accuracy
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('Accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'validation'], loc='lower right')
plt.show()
In [349]:
# summarize history for loss
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'validation'], loc='upper right')
# plt.ylim((-0.1, 1.1))
plt.show()
In [350]:
import numpy as np

# Make predictions on the test set
y_pred_prob = model.predict(X_test.toarray())
y_pred = np.round(y_pred_prob).flatten().astype(int)

# Convert predictions back to categorical labels
y_pred_labels = label_encoder.inverse_transform(y_pred)

# Convert true labels back to categorical labels
y_true_labels = label_encoder.inverse_transform(y_test)

# Create the confusion matrix
conf_matrix = confusion_matrix(y_true_labels, y_pred_labels)

# Create a DataFrame for the confusion matrix
conf_matrix_df = pd.DataFrame(conf_matrix, index=label_encoder.classes_, columns=label_encoder.classes_)
50/50 [==============================] - 0s 622us/step
In [351]:
import seaborn as sns
import matplotlib.pyplot as plt

plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix_df, annot=True, cmap='Blues', fmt='d', cbar=False)
plt.xlabel('Predicted Category')
plt.ylabel('True Category')
plt.title('Confusion Matrix')
plt.show()
In [ ]:
# training the model wiht more data get higher accuracy
# the TP (true positive )adn TN is increasing and the FN AND FP is decreasing